# Add any packages you want in this chunk:
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

We are going to look at the property prices for Orange County. From there we are going to try and predict the sales price by linear regression.

1 Importing Data

Bring in the data and make sure the data types are correct. If not, make the proper changes. The file is located within this project. data/prop_prices_reduced.csv

prop_prices <- read.csv("data\\prop_prices_reduced.csv")
head(prop_prices)
##   sale_def bed bath area_heated  area dist_cbd dist_lakes pool
## 1    88142   4    2        1270  3631  5211.83     638.57    0
## 2    78046   2    2        1037  1307 14706.67     198.90    0
## 3   273777   4    2        2821 10309 20897.89    3799.20    0
## 4   229185   4    3        2341  9578  7057.41     193.39    0
## 5   464029   4    3        2981 12287  7771.70     225.97    1
## 6   109152   3    2        1307 10088 18662.69     496.94    1
summary(prop_prices)
##     sale_def            bed             bath        area_heated   
##  Min.   :  15296   Min.   :1.000   Min.   :1.000   Min.   :  650  
##  1st Qu.: 115565   1st Qu.:3.000   1st Qu.:2.000   1st Qu.: 1387  
##  Median : 151085   Median :3.000   Median :2.000   Median : 1744  
##  Mean   : 200319   Mean   :3.398   Mean   :2.248   Mean   : 1955  
##  3rd Qu.: 213902   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.: 2301  
##  Max.   :7629992   Max.   :7.000   Max.   :8.000   Max.   :10796  
##       area           dist_cbd         dist_lakes           pool      
##  Min.   :  1307   Min.   :  598.2   Min.   :  16.74   Min.   :0.000  
##  1st Qu.:  6498   1st Qu.: 9701.7   1st Qu.: 314.86   1st Qu.:0.000  
##  Median :  8733   Median :14552.0   Median : 638.82   Median :0.000  
##  Mean   : 10565   Mean   :14382.9   Mean   :1403.54   Mean   :0.229  
##  3rd Qu.: 11250   3rd Qu.:19369.2   3rd Qu.:1735.35   3rd Qu.:0.000  
##  Max.   :269496   Max.   :36107.6   Max.   :8206.79   Max.   :1.000
prop_prices$lake_front <- prop_prices$dist_lakes
prop_prices$lake_front[prop_prices$lake_front < 100] = 1
prop_prices$lake_front[prop_prices$lake_front > 100] = 0

prop_prices$downtown <- prop_prices$dist_cbd
prop_prices$downtown[prop_prices$downtown < 5280] = 1
prop_prices$downtown[prop_prices$downtown > 5280] = 0

prop_prices$luxury = 0
prop_prices$luxury[prop_prices$bed > 4 & prop_prices$area_heated > 2500 & prop_prices$pool == 1] = 1


# prop_prices$bed <- factor(prop_prices$bed)

head(prop_prices)
##   sale_def bed bath area_heated  area dist_cbd dist_lakes pool lake_front
## 1    88142   4    2        1270  3631  5211.83     638.57    0          0
## 2    78046   2    2        1037  1307 14706.67     198.90    0          0
## 3   273777   4    2        2821 10309 20897.89    3799.20    0          0
## 4   229185   4    3        2341  9578  7057.41     193.39    0          0
## 5   464029   4    3        2981 12287  7771.70     225.97    1          0
## 6   109152   3    2        1307 10088 18662.69     496.94    1          0
##   downtown luxury
## 1        1      0
## 2        0      0
## 3        0      0
## 4        0      0
## 5        0      0
## 6        0      0
summary(prop_prices)
##     sale_def            bed             bath        area_heated   
##  Min.   :  15296   Min.   :1.000   Min.   :1.000   Min.   :  650  
##  1st Qu.: 115565   1st Qu.:3.000   1st Qu.:2.000   1st Qu.: 1387  
##  Median : 151085   Median :3.000   Median :2.000   Median : 1744  
##  Mean   : 200319   Mean   :3.398   Mean   :2.248   Mean   : 1955  
##  3rd Qu.: 213902   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.: 2301  
##  Max.   :7629992   Max.   :7.000   Max.   :8.000   Max.   :10796  
##       area           dist_cbd         dist_lakes           pool      
##  Min.   :  1307   Min.   :  598.2   Min.   :  16.74   Min.   :0.000  
##  1st Qu.:  6498   1st Qu.: 9701.7   1st Qu.: 314.86   1st Qu.:0.000  
##  Median :  8733   Median :14552.0   Median : 638.82   Median :0.000  
##  Mean   : 10565   Mean   :14382.9   Mean   :1403.54   Mean   :0.229  
##  3rd Qu.: 11250   3rd Qu.:19369.2   3rd Qu.:1735.35   3rd Qu.:0.000  
##  Max.   :269496   Max.   :36107.6   Max.   :8206.79   Max.   :1.000  
##    lake_front       downtown         luxury     
##  Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.000  
##  Mean   :0.055   Mean   :0.089   Mean   :0.035  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.000

2 Plotting

Plot histograms for all variables. Additionally, add scatterplots for the relationships between all quantitative variables.

  1. Plot of all variables
hist(prop_prices$sale_def)

hist(prop_prices$bed)

hist(prop_prices$bath)

hist(prop_prices$area_heated)

hist(prop_prices$area)

hist(prop_prices$dist_cbd)

hist(prop_prices$dist_lakes)

hist(prop_prices$pool)

hist(prop_prices$lake_front)

hist(prop_prices$downtown)

  1. Scatterplots for relationships between all quantitative variables

** Sale price against all quantitative variables

plot_ly(prop_prices, y = ~sale_def, x = ~area, type = 'scatter')
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~area_heated, type = 'scatter')
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~bed, type = 'scatter')
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~dist_cbd, type = 'scatter')
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~dist_lakes, type = 'scatter')
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~bath, type = 'scatter')
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot(prop_prices$area_heated, prop_prices$sale_def)
abline(lm(sale_def ~ area_heated, data=prop_prices))

plot(prop_prices$area, prop_prices$sale_def)
abline(lm(sale_def ~ area, data=prop_prices))

plot(prop_prices$bed, prop_prices$sale_def)
abline(lm(sale_def ~ bed, data=prop_prices))

plot(prop_prices$dist_cbd, prop_prices$sale_def)
abline(lm(sale_def ~ dist_cbd, data=prop_prices))

plot(prop_prices$dist_lakes, prop_prices$sale_def)
abline(lm(sale_def ~ dist_lakes, data=prop_prices))

plot(prop_prices$bath, prop_prices$sale_def)
abline(lm(sale_def ~ bath, data=prop_prices))

3 Summary Statistics

Provide basic summary statistics for univariate analysis. Also, provide the correlation between all the quantitative variables.

summary(prop_prices)
##     sale_def            bed             bath        area_heated   
##  Min.   :  15296   Min.   :1.000   Min.   :1.000   Min.   :  650  
##  1st Qu.: 115565   1st Qu.:3.000   1st Qu.:2.000   1st Qu.: 1387  
##  Median : 151085   Median :3.000   Median :2.000   Median : 1744  
##  Mean   : 200319   Mean   :3.398   Mean   :2.248   Mean   : 1955  
##  3rd Qu.: 213902   3rd Qu.:4.000   3rd Qu.:2.500   3rd Qu.: 2301  
##  Max.   :7629992   Max.   :7.000   Max.   :8.000   Max.   :10796  
##       area           dist_cbd         dist_lakes           pool      
##  Min.   :  1307   Min.   :  598.2   Min.   :  16.74   Min.   :0.000  
##  1st Qu.:  6498   1st Qu.: 9701.7   1st Qu.: 314.86   1st Qu.:0.000  
##  Median :  8733   Median :14552.0   Median : 638.82   Median :0.000  
##  Mean   : 10565   Mean   :14382.9   Mean   :1403.54   Mean   :0.229  
##  3rd Qu.: 11250   3rd Qu.:19369.2   3rd Qu.:1735.35   3rd Qu.:0.000  
##  Max.   :269496   Max.   :36107.6   Max.   :8206.79   Max.   :1.000  
##    lake_front       downtown         luxury     
##  Min.   :0.000   Min.   :0.000   Min.   :0.000  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000  
##  Median :0.000   Median :0.000   Median :0.000  
##  Mean   :0.055   Mean   :0.089   Mean   :0.035  
##  3rd Qu.:0.000   3rd Qu.:0.000   3rd Qu.:0.000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.000
cor(prop_prices)
##                sale_def         bed        bath area_heated        area
## sale_def     1.00000000  0.32557109  0.58731596  0.69201080  0.34354392
## bed          0.32557109  1.00000000  0.64449531  0.66911599  0.10437035
## bath         0.58731596  0.64449531  1.00000000  0.83255359  0.19298319
## area_heated  0.69201080  0.66911599  0.83255359  1.00000000  0.30184325
## area         0.34354392  0.10437035  0.19298319  0.30184325  1.00000000
## dist_cbd     0.05263700  0.23328277  0.24220637  0.25494517  0.08648327
## dist_lakes  -0.08857844  0.03747888 -0.02109154 -0.04092947 -0.15432291
## pool         0.27749823  0.31299930  0.38722523  0.43899880  0.19200110
## lake_front   0.22917369  0.05615679  0.13473393  0.16065033  0.25378323
## downtown    -0.01388353 -0.18866237 -0.18778515 -0.14152623 -0.04345479
## luxury       0.34714139  0.41391226  0.41047700  0.42886762  0.16275070
##                dist_cbd  dist_lakes        pool  lake_front    downtown
## sale_def     0.05263700 -0.08857844  0.27749823  0.22917369 -0.01388353
## bed          0.23328277  0.03747888  0.31299930  0.05615679 -0.18866237
## bath         0.24220637 -0.02109154  0.38722523  0.13473393 -0.18778515
## area_heated  0.25494517 -0.04092947  0.43899880  0.16065033 -0.14152623
## area         0.08648327 -0.15432291  0.19200110  0.25378323 -0.04345479
## dist_cbd     1.00000000  0.26520451  0.11442899 -0.02851741 -0.55941934
## dist_lakes   0.26520451  1.00000000 -0.04295257 -0.18880720 -0.18538695
## pool         0.11442899 -0.04295257  1.00000000  0.06686163 -0.08676410
## lake_front  -0.02851741 -0.18880720  0.06686163  1.00000000 -0.01378706
## downtown    -0.55941934 -0.18538695 -0.08676410 -0.01378706  1.00000000
## luxury       0.11675950 -0.05119785  0.34944579  0.09725965 -0.04041648
##                  luxury
## sale_def     0.34714139
## bed          0.41391226
## bath         0.41047700
## area_heated  0.42886762
## area         0.16275070
## dist_cbd     0.11675950
## dist_lakes  -0.05119785
## pool         0.34944579
## lake_front   0.09725965
## downtown    -0.04041648
## luxury       1.00000000

4 Regression Analysis

Run a regression with all the variables included. Print results of the regression.

simple_reg = glm(sale_def ~ bed + bath + area + area_heated + dist_cbd + dist_lakes + pool + lake_front + downtown + luxury, data=prop_prices)
summary(simple_reg) 
## 
## Call:
## glm(formula = sale_def ~ bed + bath + area + area_heated + dist_cbd + 
##     dist_lakes + pool + lake_front + downtown + luxury, data = prop_prices)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -745210   -53773     9862    65030  4866204  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.556e+04  3.571e+04  -0.716 0.474334    
## bed         -9.121e+04  1.096e+04  -8.322 2.84e-16 ***
## bath         4.902e+04  1.500e+04   3.267 0.001123 ** 
## area         3.028e+00  6.006e-01   5.042 5.48e-07 ***
## area_heated  2.382e+02  1.425e+01  16.717  < 2e-16 ***
## dist_cbd    -5.519e+00  1.260e+00  -4.381 1.31e-05 ***
## dist_lakes   3.452e+00  3.807e+00   0.907 0.364731    
## pool        -3.886e+04  1.651e+04  -2.354 0.018761 *  
## lake_front   9.549e+04  2.816e+04   3.391 0.000723 ***
## downtown     8.418e+03  2.615e+04   0.322 0.747586    
## luxury       1.532e+05  3.835e+04   3.994 6.98e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 37029720732)
## 
##     Null deviance: 8.3182e+13  on 999  degrees of freedom
## Residual deviance: 3.6622e+13  on 989  degrees of freedom
## AIC: 27186
## 
## Number of Fisher Scoring iterations: 2
coef(simple_reg) 
##   (Intercept)           bed          bath          area   area_heated 
## -25557.393538 -91214.859441  49022.220000      3.028256    238.178270 
##      dist_cbd    dist_lakes          pool    lake_front      downtown 
##     -5.519482      3.452146 -38857.531660  95492.372704   8418.297692 
##        luxury 
## 153157.610459

Which of the variables tested significant at the 95% level? Looking at the results and answering outside of the chunk is sufficient.

Answer: Pool tested significant at the 95% confidence level.

4.1 Evaluating the model

As is, are any of the Gauss-Markov assumptions violated? If so, which ones? How can you fix the issues?

Using the plot of the regression model to test for Guass-Markov violations.

The Residuals vs Fitted plot shows that there is increasing variance as price increases indicating bias which violates assumption 1. This plot also show that there is no linearity which is also a violation.

The Q-Q plot does not show a diagnol line indicating that the data is not normally distributed.

plot(simple_reg)

4.2 New Model

Based off of your findings in the previous section, make changes to the variables, the functional form, etc.

# Try removing outliers
normalized <- subset(prop_prices, prop_prices$sale_def < 1000000)

# Use only the most significant variables in the formula
fixed_reg = glm(sale_def ~ bed + area_heated + lake_front + luxury, data=normalized)
summary(fixed_reg) 
## 
## Call:
## glm(formula = sale_def ~ bed + area_heated + lake_front + luxury, 
##     data = normalized)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -261811   -33253    -3773    22079   588465  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8472.054  10191.716   0.831   0.4060    
## bed         -27164.364   3827.722  -7.097 2.44e-12 ***
## area_heated    137.089      3.938  34.816  < 2e-16 ***
## lake_front   45965.092   9714.313   4.732 2.55e-06 ***
## luxury       28550.211  13518.349   2.112   0.0349 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 4530760193)
## 
##     Null deviance: 1.3150e+13  on 990  degrees of freedom
## Residual deviance: 4.4673e+12  on 986  degrees of freedom
## AIC: 24853
## 
## Number of Fisher Scoring iterations: 2
coef(fixed_reg) 
## (Intercept)         bed area_heated  lake_front      luxury 
##   8472.0537 -27164.3638    137.0894  45965.0920  28550.2113
plot(fixed_reg)

# Looking at the plots, it seems like a better fit

# Try using log of large values to even the playing field
fixed_reg2 = glm(log(sale_def) ~ bed + log(area_heated) + lake_front + luxury, data=normalized)
summary(fixed_reg2) 
## 
## Call:
## glm(formula = log(sale_def) ~ bed + log(area_heated) + lake_front + 
##     luxury, data = normalized)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.68626  -0.16516  -0.00893   0.15101   1.12932  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.00553    0.22811  13.176  < 2e-16 ***
## bed              -0.08843    0.01698  -5.209 2.31e-07 ***
## log(area_heated)  1.23604    0.03515  35.166  < 2e-16 ***
## lake_front        0.16225    0.04077   3.979 7.42e-05 ***
## luxury            0.13410    0.05606   2.392   0.0169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.07978786)
## 
##     Null deviance: 252.224  on 990  degrees of freedom
## Residual deviance:  78.671  on 986  degrees of freedom
## AIC: 313.7
## 
## Number of Fisher Scoring iterations: 2
coef(fixed_reg2) 
##      (Intercept)              bed log(area_heated)       lake_front 
##        3.0055337       -0.0884263        1.2360374        0.1622502 
##           luxury 
##        0.1340966
plot(fixed_reg2)

5 Prediction

Based on the following inputs, predict the deflated sales price:

  • 2 bed
  • 2 bath
  • area_heated = 1223
  • area = 9750
  • dist_cbd = 19368
  • dist_lakes = 490
  • no pool
# Using basic model
new_house_df <- data.frame(bed=2,bath=2,area_heated=1223,area=9750,dist_cbd=19368,dist_lakes=490,pool=0,lake_front=0,luxury=0)
basic_linear_model <- lm(formula = sale_def ~ bed + bath + area + area_heated + dist_cbd + 
    dist_lakes + pool, data = prop_prices)
predict(basic_linear_model, newdata=new_house_df)
##        1 
## 100339.2
# Now using fixed models
predict(fixed_reg, newdata=new_house_df)
##        1 
## 121803.7
# Need to take e^predication since fixed_reg2 is based on logs
exp(predict(fixed_reg2, newdata=new_house_df))
##        1 
## 110829.5